library(tidyverse)
── Attaching core tidyverse packages ──────────────────────────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.5.1 ✔ tibble 3.2.1
✔ lubridate 1.9.4 ✔ tidyr 1.3.1
✔ purrr 1.0.4 ── Conflicts ────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(WGCNA)
Loading required package: dynamicTreeCut
Loading required package: fastcluster
Attaching package: ‘fastcluster’
The following object is masked from ‘package:stats’:
hclust
Registered S3 methods overwritten by 'htmltools':
method from
print.html tools:rstudio
print.shiny.tag tools:rstudio
print.shiny.tag.list tools:rstudio
Registered S3 method overwritten by 'htmlwidgets':
method from
print.htmlwidget tools:rstudio
Registered S3 method overwritten by 'rmarkdown':
method from
print.paged_df
Registered S3 method overwritten by 'data.table':
method from
print.data.table
Attaching package: ‘WGCNA’
The following object is masked from ‘package:stats’:
cor
library(gplots)
Attaching package: ‘gplots’
The following object is masked from ‘package:stats’:
lowess
options(stringsAsFactors = FALSE)
load sample info
sample.description <- read.csv("../input/sample.description.csv")
load reads
lcpm <- read.csv("../output/log2cpm.csv.gz", row.names = 1, check.names = FALSE)
head(lcpm)
dim(lcpm)
[1] 26704 48
Filter for genes with the highest coefficient of variation
CV <- apply(lcpm, 1, \(x) abs(sd(x)/mean(x)))
hist(log10(CV))
names(CV) <- rownames(lcpm)
CV[str_detect(names(CV), "18G076300|33G031700")]
Ceric.18G076300.v2.1 Ceric.33G031700.v2.1
0.1026816 0.2265241
quantile(CV, 0.24)
24%
0.1016702
LFY2 has a pretty low CV; have to include 76% of the genes by CV to include it.
lcpm.filter <- lcpm[CV > quantile(CV, 0.24),]
dim(lcpm.filter)
[1] 20295 48
WGCNA wants genes in columns
lcpm.filter.t <- t(lcpm.filter)
Start by including all samples
Soft thresholding
powers <- c(c(1:10), seq(from = 12, to=20, by=2))
sft <- pickSoftThreshold(lcpm.filter.t, powerVector = powers, verbose = 5,networkType = "signed hybrid", blockSize = 21000)
pickSoftThreshold: calculating connectivity for given powers...
..working on genes 1 through 20295 of 20295
Warning: executing %dopar% sequentially: no parallel backend registered
sizeGrWindow(9, 5)
par(mfrow = c(1,2))
cex1 <- 0.9
# Scale-free topology fit index as a function of the soft-thresholding power
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
main = paste("Scale independence"))
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
labels=powers,cex=cex1,col="red")
# this line corresponds to using an R^2 cut-off of h
abline(h=0.90,col="red")
# Mean connectivity as a function of the soft-thresholding power
plot(sft$fitIndices[,1], sft$fitIndices[,5],
xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")
choose 5
softPower <- 5
adjacency <- adjacency(lcpm.filter.t, power = softPower, type = "signed hybrid")
# Turn adjacency into topological overlap
TOM <- TOMsimilarity(adjacency, TOMType = "signed");
..connectivity..
..matrix multiplication (system BLAS)..
..normalization..
..done.
dissTOM <- 1-TOM
# Call the hierarchical clustering function
geneTree <- hclust(as.dist(dissTOM), method = "average")
# Plot the resulting clustering tree (dendrogram)
sizeGrWindow(12,9)
plot(geneTree, xlab="", sub="", main = "Gene clustering on TOM-based dissimilarity",
labels = FALSE, hang = 0.04)
define modules
# We like large modules, so we set the minimum module size relatively high:
minModuleSize <- 30;
# Module identification using dynamic tree cut:
dynamicMods <- cutreeDynamic(dendro = geneTree, distM = dissTOM,
deepSplit <- 2, pamRespectsDendro = FALSE,
minClusterSize = minModuleSize);
..done.
table(dynamicMods)
dynamicMods
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
3985 1679 1409 1303 1203 1073 879 852 668 523 494 474 463 432 418 398 394 375 330 324
21 22 23 24 25 26 27 28 29 30 31 32 33 34
304 301 268 230 224 195 175 171 164 148 135 131 105 68
# Convert numeric labels into colors
dynamicColors = labels2colors(dynamicMods)
table(dynamicColors)
dynamicColors
black blue brown cyan darkgreen darkgrey
879 1679 1409 432 301 230
darkmagenta darkolivegreen darkorange darkred darkturquoise green
68 105 195 304 268 1203
greenyellow grey60 lightcyan lightgreen lightyellow magenta
494 394 398 375 330 668
midnightblue orange paleturquoise pink purple red
418 224 135 852 523 1073
royalblue saddlebrown salmon skyblue steelblue tan
324 164 463 171 148 474
turquoise violet white yellow
3985 131 175 1303
# Plot the dendrogram and colors underneath
sizeGrWindow(8,6)
plotDendroAndColors(geneTree, dynamicColors, "Dynamic Tree Cut",
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05,
main = "Gene dendrogram and module colors")
merge similar modules
# Calculate eigengenes
MEList <- moduleEigengenes(lcpm.filter.t, colors = dynamicColors)
MEs <- MEList$eigengenes
# Calculate dissimilarity of module eigengenes
MEDiss <- 1-cor(MEs);
# Cluster module eigengenes
METree <- hclust(as.dist(MEDiss), method = "average");
# Plot the result
sizeGrWindow(7, 6)
plot(METree, main = "Clustering of module eigengenes",
xlab = "", sub = "")
merge with correlation > 0.75 (Change from default of 0.8 / 0.2())
MEDissThres = 0.25
# Plot the cut line into the dendrogram
plot(METree, main = "Clustering of module eigengenes",
xlab = "", sub = "")
abline(h=MEDissThres, col = "red")
# Call an automatic merging function
merge = mergeCloseModules(lcpm.filter.t, dynamicColors, cutHeight = MEDissThres, verbose = 3)
mergeCloseModules: Merging modules whose distance is less than 0.25
multiSetMEs: Calculating module MEs.
Working on set 1 ...
moduleEigengenes: Calculating 34 module eigengenes in given set.
multiSetMEs: Calculating module MEs.
Working on set 1 ...
moduleEigengenes: Calculating 26 module eigengenes in given set.
multiSetMEs: Calculating module MEs.
Working on set 1 ...
moduleEigengenes: Calculating 25 module eigengenes in given set.
Calculating new MEs...
multiSetMEs: Calculating module MEs.
Working on set 1 ...
moduleEigengenes: Calculating 25 module eigengenes in given set.
# The merged module colors
mergedColors = merge$colors
# Eigengenes of the new merged modules:
mergedMEs = merge$newMEs
compare pre and post merge
sizeGrWindow(12, 9)
#pdf(file = "Plots/geneDendro-3.pdf", wi = 9, he = 6)
plotDendroAndColors(geneTree, cbind(dynamicColors, mergedColors),
c("Dynamic Tree Cut", "Merged dynamic"),
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05)
#dev.off()
# Rename to moduleColors
moduleColors = mergedColors
# Construct numerical labels corresponding to the colors
colorOrder = c("grey", standardColors(50));
moduleLabels = match(moduleColors, colorOrder)-1;
MEs = mergedMEs
table(merge$colors)
black blue brown cyan darkgreen darkgrey darkmagenta
879 2994 2612 432 406 230 68
darkorange darkred darkturquoise greenyellow grey60 lightcyan lightgreen
195 304 492 494 394 398 375
lightyellow magenta midnightblue paleturquoise purple saddlebrown skyblue
330 1741 418 459 523 312 171
tan turquoise violet white
474 5288 131 175
length(table(merge$colors))
[1] 25
median(table(merge$colors))
[1] 406
Which module is LFY in?
CrLFY1 <- "Ceric.33G031700"
CrLFY2 <- "Ceric.18G076300"
module.assignment <- tibble(geneID=colnames(lcpm.filter.t), module = mergedColors)
module.assignment %>%
filter(str_detect(geneID, "18G076300|33G031700"))
Interesting: they are in different modules(!).
module.assignment %>% group_by(module) %>% summarize(n_genes = n()) %>% arrange(n_genes)
Plot eigengenes
Make sure sample info sheet is in the correct order.
rownames(lcpm.filter.t) %>% str_replace_all("\\.", "-") == sample.description$sample
[1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[20] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[39] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
sample.eigen <- cbind(sample.description, MEs)
sample.eigen
sample.eigen %>%
mutate(gt_tissue=str_c(tissue, "-", base_gt)) %>%
pivot_longer(starts_with("ME"), names_to = "ME") %>%
ggplot(aes(x=gt_tissue, y = value, color = tissue)) +
geom_point() +
facet_wrap(~ME, ncol=5) +
theme(axis.text.x = element_text(angle = 90, hjust=1, vjust=.5)) +
scale_color_brewer(type="qual", palette = "Set3")
A heat map:
MEs.m <- as.matrix(MEs)
heatmap.2(MEs.m, trace="none", cexRow= 0.6, col="bluered")
save(module.assignment, MEs, lcpm.filter, CrLFY1, CrLFY2, file="../output/WGCNA_allSamples.Rdata")